This section and the next are relevant for reproducibility purposes. For results, please skip to section 3 (Quality Control)
suppressPackageStartupMessages({
library(data.table)
library(DESeq2)
library(emoji)
library(gplots)
library(gtools)
library(here)
library(hyperSpec)
library(limma)
library(magrittr)
library(parallel)
library(patchwork)
library(PCAtools)
library(pheatmap)
library(plotly)
library(pvclust)
library(RColorBrewer)
library(tidyverse)
library(tximport)
library(vsn)
})tx2gene <- suppressMessages(read_delim(here("reference/annotation/tx2gene.tsv.gz"),delim="\t",
col_names=c("TXID","GENE")))filelist <- list.files(here("analysis/salmon"),
recursive = TRUE,
pattern = "quant.sf",
full.names = TRUE)stopifnot(all(match(sub("_1_sort.*","",basename(dirname(filelist))),
samples$SampleID) == 1:nrow(samples)))Read the expression at the gene level
txi <- suppressMessages(tximport(files = samples$Filenames,
type = "salmon",
tx2gene=tx2gene))
counts <- txi$counts
colnames(counts) <- samples$SampleID
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))## [1] "20.3% percent (7515) of 37075 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples)
ggplot(dat,aes(x,y,fill=Genotype)) +
geom_col() +
scale_y_continuous(name="reads") +
facet_grid(~ factor(Genotype), scales = "free") +
theme_bw() +
theme(axis.text.x=element_text(angle=90,size=4),
axis.title.x=element_blank())👉 We observe a lot of difference in the raw sequencing depth, but we know some are tech replicates.
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density(na.rm = TRUE) +
ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)") +
theme_bw()👉 The cumulative gene coverage is as expected, despite the lower coverage of some samples
dat <- as.data.frame(log10(counts)) %>%
utils::stack() %>%
mutate(Genotype=samples$Genotype[match(ind,samples$SampleID)],
TechRep=samples$TechRep[match(ind,samples$SampleID)])
ggplot(dat,aes(x=values,group=ind,col=Genotype)) +
geom_density(na.rm = TRUE) +
ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
theme_bw()ggplot(dat,aes(x=values,group=ind,col=TechRep)) +
geom_density(na.rm = TRUE) +
ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
theme_bw()👉 Samples have different sequencing depth that is in agreement with the presence of technical replicates
dir.create(here("data/analysis/salmon"),showWarnings=FALSE,recursive=TRUE)
write.csv(counts,file=here("data/analysis/salmon/raw-unormalised-gene-expression_data.csv"))
For visualization, the data is submitted to a variance stabilization transformation using DESeq2. The dispersion is estimated independently of the sample tissue and replicate.
## using counts and average transcript lengths from tximport
(i.e. the sequencing library size effect)
dds <- estimateSizeFactors(dds) %>%
suppressMessages()
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y")
abline(h=1, col = "Red", lty = 3)and without outliers:
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)👉 Seven samples had been clearly under-sequenced.
Assess whether there might be a difference in library size linked to a given metadata
boxplot(split(t(normalizationFactors(dds)),dds$Genotype),las=2,
main="Sequencing libraries size factor by Genotype",
outline=FALSE)👉 The scaling factor distribution is similar for all genotypes with line3 having less lowly sequenced technical replicates.
plot(colMeans(normalizationFactors(dds)),
log10(colSums(counts(dds))),ylab="log10 raw depth",
xlab="average scaling factor",
col=rainbow(n=nlevels(dds$Genotype))[as.integer(dds$Genotype)],pch=19)
legend("bottomright",fill=rainbow(n=nlevels(dds$Genotype)),
legend=levels(dds$Genotype),cex=0.6)👉 The scaling factor appears linear for the samples with sufficient sequencing depth. Otherwise, not unintuitively, the distribution is logarithmic
let’s look at standard deviations before and after VST normalization. This plot is to see whether there is a dependency of SD on the mean.
Before:
After:
After VST normalization, the red line is almost horizontal which indicates no dependency of variance on mean (homoscedastic).
👉 We can conclude that the variance stabilization worked adequately
Using PCAtools
We define the number of variable of the model:
An the number of possible combinations
We devise the optimal number of components using two methods
We plot the percentage explained by different components and try to empirically assess whether the observed number of components would be in agreement with our model’s assumptions.
ggplot(tibble(x=1:length(percent),y=cumsum(percent),p=percent),aes(x=x,y=y)) +
geom_line() + geom_col(aes(x,p)) + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component",breaks=1:length(percent),minor_breaks=NULL) +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",linewidth=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",linewidth=0.5) +
geom_vline(xintercept=c(horn$n,elbow),colour="black",linetype="dotted",linewidth=0.5) +
geom_label(aes(x = horn$n + 1, y = cumsum(percent)[horn$n],label = 'Horn', vjust = 1)) +
geom_label(aes(x = elbow + 1, y = cumsum(percent)[elbow],label = 'Elbow', vjust = 1))## Warning: Removed 7 rows containing missing values (`geom_line()`).
👉 The first component explains 50% of the data variance. Both metrics, Horn and Elbow suggest that three or four components are those that are informative. Indeed the slope of the curve is fairly linear past PC3 and that would indicate that the remaining PCs only capture sample specific noise. While this is only empirical, the scree plot support having only few variables of importance in the dataset.
p1 <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Genotype,shape=BioRep,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p1) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep=""))) %>% suppressWarnings()The same as a biplot
biplot(p,lab=samples$BioRep,
colby = 'Genotype',
colLegendTitle = 'Genotype',
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)## Registered S3 methods overwritten by 'ggalt':
## method from
## grid.draw.absoluteGrob ggplot2
## grobHeight.absoluteGrob ggplot2
## grobWidth.absoluteGrob ggplot2
## grobX.absoluteGrob ggplot2
## grobY.absoluteGrob ggplot2
👉 The first dimension separates the lines from WT (T89). The second dimension separates the lines. The technical replicates cluster close together.
p2 <- ggplot(pc.dat,aes(x=PC1,y=PC3,col=Genotype,shape=BioRep,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p2) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC3 (",percent[3],"%)",sep=""))) %>% suppressWarnings()The same as a biplot
biplot(p,x = 'PC1', y = 'PC3',
lab = samples$BioRep,
colby = 'Genotype',
colLegendTitle = 'Genotype',
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)👉 Interestingly, the third dimension separates biological replicates from line3. Note that this only explains 8% of the data variance.
This allows for looking at more dimensions, five by default
👉 PC1 and PC2 are explained by the genotypes. PC3 to PC5, each explain the variance within one genotype.
Loadings, i.e. the individual effect of every gene in a component can be studied. Here the most important ones are visualized throughout the different PCs
plotloadings(p,
rangeRetain = 0.01,
labSize = 4.0,
title = 'Loadings plot',
subtitle = 'PC1 to PC5',
caption = 'Top 1% variables',
drawConnectors = TRUE)## -- variables retained:
## Potra2n19c34179, Potra2n10c21157, Potra2n8c17993, Potra2n3c6790, Potra2n5c10698, Potra2n18c32926, Potra2n317s35301, Potra2n19c33524, Potra2n16c30110
👉 Among the nine genes selected that have the most important loading to the first five PCs, none of them is in the list of gene of interest. It might be interesting to look these genes up.
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- colnames(sampleDistMatrix) <- paste(dds$Genotype,dds$BioRep)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=pal)👉 The samples group be genotype. Not all the technical replicates group as expected, but that is not unexpected given the shallow sequencing depth of some of these.
The figures show the number of genes expressed per condition at different expression cutoffs. The scale on the lower plot is the same as on the upper. The first plot is a heatmap showing the number of genes above a given cutoff. The second plot shows it as a ratio of the number of genes expressed for (a) given variable(s) divided by the average number of genes expressed at that cutoff across all variable(s). The latter plot is of course biased at higher cutoff as the number of genes becomes smaller and smaller. The point of these two plots is to assert whether the number of genes expressed varies between conditions, as this would break some assumptions for normalisation and differential expression.
👉 There is almost no difference in the number of genes expressed by the different genotypes.
Plotting the number of genes that are expressed (at any level)
do.call(rbind,split(t(nrow(vst) - colSums(vst==0)),samples$Genotype)) %>% as.data.frame() %>%
rownames_to_column("Genotype") %>% pivot_longer(starts_with("V")) %>%
ggplot(aes(x=Genotype, y=value, fill=Genotype)) + geom_dotplot(binaxis = "y", stackdir = "center")## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
👉 Here the results are clearly affected by the lower sequencing of some of the samples.
Here we want to visualise all the informative genes as a heatmap. We first filter the genes to remove those below the selected noise/signal cutoff. The method employed here is naive, and relies on observing a sharp decrease in the number of genes within the first few low level of expression. Using an independent filtering method, such as implemented in DESeq2 would be more accurate, but for the purpose of QA validation, a naive approach is sufficient. Note that a sweet spot for computation is around 20 thousand genes, as building the hierarchical clustering for the heatmap scales non-linearly.
👉 Here a cutoff of 1 is applied, as a naive way to separate the signal from the noise.
vst.cutoff <- 1
nn <- nrow(vst[sels[[vst.cutoff+1]],])
tn <- nrow(vst)
pn <- round(nn * 100/ tn, digits=1)⚠️ 63.8% (23636) of total 37075 genes are plotted below:
mat <- t(scale(t(vst[sels[[vst.cutoff+1]],])))
hm <- pheatmap(mat,
color = hpal,
border_color = NA,
clustering_distance_rows = "correlation",
clustering_method = "ward.D2",
show_rownames = FALSE,
labels_col = conds,
angle_col = 90,
legend = FALSE)👉 The heatmap shows the expected clustering.
Below we assess the previous dendrogram’s reproducibility and plot the clusters with au and bp where:
⚠️Clusters with AU larger than 95% are highlighted by rectangles, which are strongly supported by data
hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
method.hclust = "ward.D2",
nboot = 100, parallel = TRUE)## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.
👉 The clustering results shows after bootstrapping that the technical replicates are all as they should, closest with each-other.
##
## Cluster method: ward.D2
## Distance : correlation
##
## Estimates on edges:
##
## si au bp se.si se.au se.bp v c pchi
## 1 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 2 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 3 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 4 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 5 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 6 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 7 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 8 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 9 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 10 0.996 0.999 0.985 0.016 0.007 0.007 -2.581 0.408 0.853
## 11 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 12 0.996 0.999 0.985 0.016 0.007 0.007 -2.581 0.408 0.853
## 13 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 14 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 15 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 16 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 17 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 18 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 19 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
## 20 1.000 1.000 1.000 0.000 0.000 0.000 0.000 0.000 0.000
samples$BioID <- paste0(samples$Genotype,"_",samples$BioRep)
txi$counts <- sapply(split.data.frame(t(txi$counts),samples$BioID),colSums)
txi$length <- sapply(split.data.frame(t(txi$length),samples$BioID),colMaxs)
samples <- samples[match(colnames(txi$counts),samples$BioID),]
counts <- txi$counts## using counts and average transcript lengths from tximport
sel <- rowSums(counts) == 0
sprintf("%s%% percent (%s) of %s genes are not expressed",
round(sum(sel) * 100/ nrow(counts),digits=1),
sum(sel),
nrow(counts))## [1] "20.3% percent (7515) of 37075 genes are not expressed"
dat <- tibble(x=colnames(counts),y=colSums(counts)) %>%
bind_cols(samples)
ggplot(dat,aes(x,y,fill=Genotype)) +
geom_col() +
scale_y_continuous(name="reads") +
facet_grid(~ factor(Genotype), scales = "free") +
theme_bw() +
theme(axis.text.x=element_text(angle=90,size=4),
axis.title.x=element_blank())👉 The raw sequencing depth is now very similar between replicates.
i.e. the mean raw count of every gene across samples is calculated and displayed on a log10 scale.
ggplot(data.frame(value=log10(rowMeans(counts))),aes(x=value)) +
geom_density(na.rm = TRUE) +
ggtitle("gene mean raw counts distribution") +
scale_x_continuous(name="mean raw counts (log10)") +
theme_bw()👉 The cumulative gene coverage is as expected, the great sequencing depth means that we have a broad left shoulder, likely indicative of an elevated signal-to-noise ratio.
dat <- as.data.frame(log10(counts)) %>%
utils::stack() %>%
mutate(Genotype=samples$Genotype[match(ind,samples$BioID)])
ggplot(dat,aes(x=values,group=ind,col=Genotype)) +
geom_density(na.rm = TRUE) +
ggtitle("sample raw counts distribution") +
scale_x_continuous(name="per gene raw counts (log10)") +
theme_bw()👉 All samples have a similar profile.
dds <- estimateSizeFactors(dds) %>%
suppressMessages()
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y")
abline(h=1, col = "Red", lty = 3)and without outliers:
boxplot(normalizationFactors(dds),
main="Sequencing libraries size factor",
las=2,log="y", outline=FALSE)
abline(h=1, col = "Red", lty = 3)👉 The difference in sequencing depth varies between -15/+25%, which is little.
Assess whether there might be a difference in library size linked to a given metadata
boxplot(split(t(normalizationFactors(dds)),dds$Genotype),las=2,
main="Sequencing libraries size factor by Genotype",
outline=FALSE)👉 The scaling factor distribution varies between genotypes, possibly indicative that different number of genes are expressed in the different genotypes.
plot(colMeans(normalizationFactors(dds)),
log10(colSums(counts(dds))),ylab="log10 raw depth",
xlab="average scaling factor",
col=rainbow(n=nlevels(dds$Genotype))[as.integer(dds$Genotype)],pch=19)
legend("bottomright",fill=rainbow(n=nlevels(dds$Genotype)),
legend=levels(dds$Genotype),cex=0.6)👉 If we assume a linear relationship across the “line” samples, then T89 would seem to have a lower scaling factor for a higher depth, a sign that the number of genes expressed in T89 (WT) would be different from than in the mutated lines.
Using PCAtools
We devise the optimal number of components using two methods
We plot the percentage explained by different components and try to empirically assess whether the observed number of components would be in agreement with our model’s assumptions.
ggplot(tibble(x=1:length(percent),y=cumsum(percent),p=percent),aes(x=x,y=y)) +
geom_line() + geom_col(aes(x,p)) + scale_y_continuous("variance explained (%)",limits=c(0,100)) +
scale_x_continuous("Principal component",breaks=1:length(percent),minor_breaks=NULL) +
geom_vline(xintercept=nvar,colour="red",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nvar],colour="red",linetype="dashed",linewidth=0.5) +
geom_vline(xintercept=nlevel,colour="orange",linetype="dashed",linewidth=0.5) +
geom_hline(yintercept=cumsum(percent)[nlevel],colour="orange",linetype="dashed",linewidth=0.5) +
geom_vline(xintercept=c(horn$n,elbow),colour="black",linetype="dotted",linewidth=0.5) +
geom_label(aes(x = horn$n + 1, y = cumsum(percent)[horn$n],label = 'Horn', vjust = 1)) +
geom_label(aes(x = elbow + 1, y = cumsum(percent)[elbow],label = 'Elbow', vjust = 1))## Warning: Removed 2 rows containing missing values (`geom_line()`).
👉 The first component explains 53% of the data variance. Both metrics, Horn and Elbow suggest that two or three components are those that are informative. Indeed the slope of the curve is fairly linear past PC3 and that would indicate that the remaining PCs only capture sample specific noise. While this is only empirical, the scree plot support having only few variables of importance in the dataset.
p1 <- ggplot(pc.dat,aes(x=PC1,y=PC2,col=Genotype,shape=BioRep,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p1) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC2 (",percent[2],"%)",sep=""))) %>% suppressWarnings()The same as a biplot
biplot(p,lab=samples$BioRep,
colby = 'Genotype',
colLegendTitle = 'Genotype',
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)👉 The first dimension separates the lines from WT (T89). The second dimension separates the lines. The technical replicates cluster close together.
p2 <- ggplot(pc.dat,aes(x=PC1,y=PC3,col=Genotype,shape=BioRep,text=SampleID)) +
geom_point(size=2) +
ggtitle("Principal Component Analysis",subtitle="variance stabilized counts")
ggplotly(p2) %>%
layout(xaxis=list(title=paste("PC1 (",percent[1],"%)",sep="")),
yaxis=list(title=paste("PC3 (",percent[3],"%)",sep=""))) %>% suppressWarnings()The same as a biplot
biplot(p,x = 'PC1', y = 'PC3',
lab = samples$BioRep,
colby = 'Genotype',
colLegendTitle = 'Genotype',
encircle = TRUE,
encircleFill = TRUE,
legendPosition = 'top',
legendLabSize = 16, legendIconSize = 8.0)👉 Interestingly, the third dimension separates biological replicates from line3. Note that this only explains 8.52% of the data variance.
This allows for looking at more dimensions, five by default
👉 PC1 and PC2 are explained by the genotypes. PC3 to PC4, each explain the variance within one “line” genotype. PC5 is harder to explain, but seem influenced by some replicates.
Loadings, i.e. the individual effect of every gene in a component can be studied. Here the most important ones are visualized throughout the different PCs
plotloadings(p,
rangeRetain = 0.01,
labSize = 4.0,
title = 'Loadings plot',
subtitle = 'PC1 to PC5',
caption = 'Top 1% variables',
drawConnectors = TRUE)## -- variables retained:
## Potra2n19c34179, Potra2n3c6790, Potra2n8c17993, Potra2n5c10698, Potra2n8c18373, Potra2n7c15576, Potra2n7c15584, Potra2n16c30110
👉 Among the eight genes selected that have the most important loading to the first five PCs, none of them is in the list of gene of interest. It might be interesting to look these genes up.
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- colnames(sampleDistMatrix) <- paste(dds$Genotype,dds$BioRep)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=pal)👉 The samples group be genotype.
The figures show the number of genes expressed per condition at different expression cutoffs. The scale on the lower plot is the same as on the upper. The first plot is a heatmap showing the number of genes above a given cutoff. The second plot shows it as a ratio of the number of genes expressed for (a) given variable(s) divided by the average number of genes expressed at that cutoff across all variable(s). The latter plot is of course biased at higher cutoff as the number of genes becomes smaller and smaller. The point of these two plots is to assert whether the number of genes expressed varies between conditions, as this would break some assumptions for normalisation and differential expression.
👉 There is almost no difference in the number of genes expressed by the different genotypes at lower level of expression, but T89 has more genes expressed and at a highly level of expression.
Plotting the number of genes that are expressed (at any level)
do.call(rbind,split(t(nrow(vst) - colSums(vst==0)),samples$Genotype)) %>% as.data.frame() %>%
rownames_to_column("Genotype") %>% pivot_longer(starts_with("V")) %>%
ggplot(aes(x=Genotype, y=value, fill=Genotype)) + geom_dotplot(binaxis = "y", stackdir = "center")## Bin width defaults to 1/30 of the range of the data. Pick better value with
## `binwidth`.
👉 This is interesting, overall T89 has less genes expressed but these are expressed at a higher level.
Here we want to visualise all the informative genes as a heatmap. We first filter the genes to remove those below the selected noise/signal cutoff. The method employed here is naive, and relies on observing a sharp decrease in the number of genes within the first few low level of expression. Using an independent filtering method, such as implemented in DESeq2 would be more accurate, but for the purpose of QA validation, a naive approach is sufficient. Note that a sweet spot for computation is around 20 thousand genes, as building the hierarchical clustering for the heatmap scales non-linearly.
👉 Here a cutoff of 1 is applied, as a naive way to separate the signal from the noise.
vst.cutoff <- 1
nn <- nrow(vst[sels[[vst.cutoff+1]],])
tn <- nrow(vst)
pn <- round(nn * 100/ tn, digits=1)⚠️ 64.5% (23899) of total 37075 genes are plotted below:
mat <- t(scale(t(vst[sels[[vst.cutoff+1]],])))
hm <- pheatmap(mat,
color = hpal,
border_color = NA,
clustering_distance_rows = "correlation",
clustering_method = "ward.D2",
show_rownames = FALSE,
labels_col = conds,
angle_col = 90,
legend = FALSE)👉 The heatmap shows the expected clustering and a lot of potential for differential expression.
Below we assess the previous dendrogram’s reproducibility and plot the clusters with au and bp where:
⚠️Clusters with AU larger than 95% are highlighted by rectangles, which are strongly supported by data
hm.pvclust <- pvclust(data = t(scale(t(vst[sels[[vst.cutoff+1]],]))),
method.hclust = "ward.D2",
nboot = 100, parallel = TRUE)## Creating a temporary cluster...done:
## socket cluster with 15 nodes on host 'localhost'
## Multiscale bootstrap... Done.
⭐ The data is of good quality. The initial data had multiple technical replicates that were merged after an initial assessment.
⭐ The technical replicate merged data is also of good quality, with all assessment plots separating the genotypes and grouping the replicates as they should.
⭐ The wild-type T89 seem to have a less diversified transcriptome - less genes expressed - at a comparatively higher level. This might be an artifact of the normalization that expects the same number of genes to be expressed across all genotypes.
## R version 4.3.1 (2023-06-16)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 22.04.3 LTS
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.20.so; LAPACK version 3.10.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## time zone: Etc/UTC
## tzcode source: system (glibc)
##
## attached base packages:
## [1] parallel grid stats4 stats graphics grDevices utils
## [8] datasets methods base
##
## other attached packages:
## [1] vsn_3.68.0 tximport_1.28.0
## [3] lubridate_1.9.3 forcats_1.0.0
## [5] stringr_1.5.0 dplyr_1.1.3
## [7] purrr_1.0.2 readr_2.1.4
## [9] tidyr_1.3.0 tibble_3.2.1
## [11] tidyverse_2.0.0 RColorBrewer_1.1-3
## [13] pvclust_2.2-0 plotly_4.10.2
## [15] pheatmap_1.0.12 PCAtools_2.12.0
## [17] ggrepel_0.9.3 patchwork_1.1.3
## [19] magrittr_2.0.3 limma_3.56.2
## [21] hyperSpec_0.100.0 xml2_1.3.5
## [23] ggplot2_3.4.3 lattice_0.21-8
## [25] here_1.0.1 gtools_3.9.4
## [27] gplots_3.1.3 emoji_15.0
## [29] DESeq2_1.40.2 SummarizedExperiment_1.30.2
## [31] Biobase_2.60.0 MatrixGenerics_1.12.3
## [33] matrixStats_1.0.0 GenomicRanges_1.52.0
## [35] GenomeInfoDb_1.36.3 IRanges_2.34.1
## [37] S4Vectors_0.38.2 BiocGenerics_0.46.0
## [39] data.table_1.14.8
##
## loaded via a namespace (and not attached):
## [1] rstudioapi_0.15.0 jsonlite_1.8.7
## [3] farver_2.1.1 rmarkdown_2.25
## [5] zlibbioc_1.46.0 vctrs_0.6.3
## [7] DelayedMatrixStats_1.22.6 RCurl_1.98-1.12
## [9] htmltools_0.5.6 S4Arrays_1.0.6
## [11] proj4_1.0-13 sass_0.4.7
## [13] KernSmooth_2.23-22 bslib_0.5.1
## [15] htmlwidgets_1.6.2 plyr_1.8.9
## [17] testthat_3.1.10 cachem_1.0.8
## [19] ggalt_0.4.0 lifecycle_1.0.3
## [21] pkgconfig_2.0.3 rsvd_1.0.5
## [23] Matrix_1.6-0 R6_2.5.1
## [25] fastmap_1.1.1 GenomeInfoDbData_1.2.10
## [27] digest_0.6.33 colorspace_2.1-0
## [29] rprojroot_2.0.3 dqrng_0.3.1
## [31] irlba_2.3.5.1 crosstalk_1.2.0
## [33] beachmat_2.16.0 labeling_0.4.3
## [35] fansi_1.0.4 timechange_0.2.0
## [37] httr_1.4.7 abind_1.4-5
## [39] compiler_4.3.1 bit64_4.0.5
## [41] withr_2.5.1 BiocParallel_1.34.2
## [43] hexbin_1.28.3 Rttf2pt1_1.3.12
## [45] maps_3.4.1 MASS_7.3-60
## [47] DelayedArray_0.26.7 caTools_1.18.2
## [49] tools_4.3.1 extrafontdb_1.0
## [51] glue_1.6.2 reshape2_1.4.4
## [53] generics_0.1.3 gtable_0.3.4
## [55] tzdb_0.4.0 preprocessCore_1.62.1
## [57] hms_1.1.3 BiocSingular_1.16.0
## [59] ScaledMatrix_1.8.1 utf8_1.2.3
## [61] XVector_0.40.0 pillar_1.9.0
## [63] vroom_1.6.4 bit_4.0.5
## [65] deldir_1.0-9 tidyselect_1.2.0
## [67] locfit_1.5-9.8 knitr_1.44
## [69] xfun_0.40 brio_1.1.3
## [71] stringi_1.7.12 lazyeval_0.2.2
## [73] yaml_2.3.7 evaluate_0.22
## [75] codetools_0.2-19 interp_1.1-4
## [77] extrafont_0.19 BiocManager_1.30.22
## [79] cli_3.6.1 affyio_1.70.0
## [81] ash_1.0-15 munsell_0.5.0
## [83] jquerylib_0.1.4 Rcpp_1.0.11
## [85] png_0.1-8 ellipsis_0.3.2
## [87] latticeExtra_0.6-30 jpeg_0.1-10
## [89] sparseMatrixStats_1.12.2 bitops_1.0-7
## [91] viridisLite_0.4.2 scales_1.2.1
## [93] affy_1.78.2 crayon_1.5.2
## [95] rlang_1.1.1 cowplot_1.1.1
Created by Nicolas Delhomme